# replication script 
# reny and newman
# protecting the right to discriminate
# apsr
# June 2018

# install any packages needed before running this
library(tidyverse)
library(stargazer)
library(betareg)
library(RColorBrewer)
library(simcf)
library(MASS)
library(eiCompare)
library(rgdal)
library(sp)
library(rgeos)
library(gridExtra)
library(lmtest)

# set color palette
colors <- brewer.pal(10, 'RdGy')

# load up helper functions
rse <- function(x){
  return(sqrt(diag(vcovHC(x, type='HC1'))))
}

bpm <- function(x){
  n <- length(x)
  res <- rep(NA,n)
  for(i in 1:n){
    res[i] <- bptest(x[[i]])$p.value
  }
  return(res)
}

gen_xtabs_factor <- function(vars, data, margins = TRUE, percent = TRUE) {
  if(!margins) {
    if(class(data)[1] == "survey.design2") {
      tabs <- as.data.frame(svytable(as.formula(paste0("~", paste(vars, collapse = " + "), " - 1")), data))
    } else {
      warning("Sample object must be a survey.design object.")
      break
    }
    if(percent) tabs$Freq = tabs$Freq / sum(tabs$Freq)
  } else {
    if(class(data)[1] == "survey.design2") {
      tabs = NULL
      for(var in vars) {
        summary <- as.data.frame(svymean(as.formula(paste0("~", var, " - 1")), data))
        temp_tabs <- as.data.frame(t(summary$mean))
        names(temp_tabs) <- rownames(summary)
        if(var != vars[1] & length(vars) > 1) {
          temp_tabs <- temp_tabs[-1]
        }
        tabs <- c(tabs, temp_tabs)
      }
      tabs <- as.data.frame(tabs)
    } else {
      warning("Population object must be a survey.design object.")
      break
    }
    if(!percent) tabs <- tabs * nrow(data)
  }
  if(margins) {
    tabs <- t(tabs)
  }
  return(tabs)
}
clusterRobustSE <- function(model,cluster){
  
  #install.packages() if you don't have
  require(sandwich)
  require(lmtest)
  
  # degree of freedom correction
  M <- length(unique(cluster))
  N <- length(cluster)
  K <- model$rank
  dfc <- (M/(M - 1)) * ((N - 1)/(N - K))
  
  # adjust standard errors using cluster
  vcovAdj <- dfc * vcovHC(model, type = "HC0", cluster = cluster, adjust = T)
  
  return(list(pe=coeftest(model, vcov = vcovAdj),
              vcov=vcovAdj))
}

# read in data

#setwd() #set to appropriate directory where you downloaded data
setwd('/Users/tylerreny/Dropbox/Reny_Newman/Prop 14 Project/writing/submission APSR/conditional_accept/replication')
df <- read_csv('clean_data_final_reny_newman.csv')

# Table 1 Main Models -----------------------------------------------------------------

bivariate_form <- pct_14_yes ~ dist_black_98_percentile
full_model_form <- pct_14_yes ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden
cube_form <- pct_14_yes ~ dist_black_98_percentile + proxsquare + proxcube + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden
log_form <- pct_14_yes ~ log_prox + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden
out1 <- lm(bivariate_form, data=df[df$pct_white_1960 >= 0.90,])
out2 <- lm(full_model_form,data=df[df$pct_white_1960 >= 0.90,])
out3 <- lm(cube_form ,data=df[df$pct_white_1960 >= 0.90,])
out4 <- lm(log_form ,data=df[df$pct_white_1960 >= 0.90,])

#testing for heteroskedasticity
bpm(list(out1,out2,out3,out4))

stargazer(out1,out2,out3,out4,
          style = 'APSR',
          se=list(rse(out1),rse(out2),rse(out3),rse(out4)),
          covariate.labels = c('Proximity','Proximity Squared','Proximity Cubed',
                               'Log Proximity','Median Income','Unemployment','Homeownership',
                               'Partisan Composition (%D)','Population Density'),
          digits = 2,digits.extra = 1,
          dep.var.labels = 'Prop 14, 1964',
          type = 'html',out='tables/table_1.html')

out2 <- betareg(I(pct_14_yes/100) ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_1960 >= 0.90,])
stargazer(out2,
          style = 'APSR',
          covariate.labels = c('Proximity','Median Income','Unemployment','Homeownership',
                               'Partisan Composition (%D)','Population Density'),
          digits = 2,digits.extra = 1,
          dep.var.labels = 'Prop 14, 1964',
          type = 'html',out='tables/table_D1.html')

# Figure 2 Predicted values and first differences -----------------------------------------------

out <- lm(full_model_form,data=df[df$pct_white_1960 >= 0.90,])

# extract coefficinets and vcov matrix
pe <- coef(out)
vc <- vcovHC(out, type='HC1')
set.seed(123)
simbetas <- MASS::mvrnorm(1000,pe,vc)

# establish counterfactual values
lower <- mean(df$dist_black_98_percentile, na.rm=T) - 2*sd(df$dist_black_98_percentile, na.rm=T)
xhyp <- seq(lower, 0, length.out = 100)

# make model matrix
mm <- cfMake(full_model_form, df[df$pct_white_1960 >= 0.90,], nscen=100)

# set proximity to counterfactual values
mm <- cfChange(mm, 'dist_black_98_percentile', x=xhyp, scen=1:100)

# calculate predicted values w 95% CIs using simulation
out <- linearsimev(mm, simbetas, 0.95)

#first difference
mm2 <- cfMake(full_model_form, df[df$pct_white_1960 >= 0.90,], nscen=1)
mm2 <- cfChange(mm2, 'dist_black_98_percentile', xpre=xhyp[1],x=xhyp[100], scen=1)
linearsimfd(mm2, simbetas, 0.95)

# shape into dataframe for plotting
dat <- data.frame(x=xhyp, y=out$pe,lower=out$lower, upper=out$upper)
g1 <- ggplot(dat, aes(x,y)) +
  geom_line() +
  geom_line(aes(x,upper),linetype=2) +
  geom_line(aes(x, lower), linetype=2) +
  scale_y_continuous(limits=c(50,80)) +
  theme_bw() +
  labs(
    y='Predicted Vote Prop 14 (%)',
    x='Prox Growing Black Cities\n(miles)'
  ) +
  geom_rug(data=df[df$pct_white_1960 >= 0.9,], 
           aes(pct_14_yes, x=dist_black_98_percentile),sides = 'b')+
  scale_x_continuous(limits=c(lower,0),
                     labels=c(-200,-150,-100,-50,0))

# cubic specification
out2 <- lm(cube_form,data=df[df$pct_white_1960 >= 0.90,])
pe <- coef(out2)
vc <- vcovHC(out2, type='HC1')
set.seed(123)
simbetas <- MASS::mvrnorm(1000,pe,vc)

# set counterfactuals
mm <- cfMake(cube_form, df[df$pct_white_1960 >= 0.90,], nscen=100)
mm <- cfChange(mm, 'dist_black_98_percentile', x=xhyp, scen=1:100)
mm <- cfChange(mm, 'proxsquare', x=xhyp^2, scen=1:100)
mm <- cfChange(mm, 'proxcube', x=xhyp^3, scen=1:100)

mm2 <- cfMake(cube_form, df[df$pct_white_1960 >= 0.90,], nscen=1)
mm2 <- cfChange(mm2, 'dist_black_98_percentile', xpre=xhyp[61], x=xhyp[100], scen=1)
mm2 <- cfChange(mm2, 'proxsquare', xpre=xhyp[61]^2, x=xhyp[100]^2 ,scen=1)
mm2 <- cfChange(mm2, 'proxcube', xpre=xhyp[61]^3, x=xhyp[100]^3, scen=1)

mm3 <- cfMake(cube_form, df[df$pct_white_1960 >= 0.90,], nscen=1)
mm3 <- cfChange(mm3, 'dist_black_98_percentile', xpre=xhyp[1], x=xhyp[100], scen=1)
mm3 <- cfChange(mm3, 'proxsquare', xpre=xhyp[1]^2, x=xhyp[100]^2 ,scen=1)
mm3 <- cfChange(mm3, 'proxcube', xpre=xhyp[1]^3, x=xhyp[100]^3, scen=1)

# predicted values
out <- linearsimev(mm, simbetas, 0.95)
linearsimfd(mm2, simbetas, 0.95) # first diff estimates
linearsimfd(mm3, simbetas, 0.95)

dat <- data.frame(x=xhyp, y=out$pe,lower=out$lower, upper=out$upper)
g2 <- ggplot(dat, aes(x,y)) +
  geom_line() +
  geom_line(aes(x,upper),linetype=2) +
  geom_line(aes(x, lower), linetype=2) +
  scale_y_continuous(limits=c(50,80)) +
 theme_bw() +
  labs(
    y='',
    x='Prox Growing Black Cities\n(miles)'
  ) +
  geom_rug(data=df[df$pct_white_1960 >= 0.9,], 
           aes(pct_14_yes, x=dist_black_98_percentile),sides = 'b')+
  scale_x_continuous(limits=c(lower,0),
                     labels = c(-200, -150, -100, -50, 0))


# log specification
pe <- coef(out4)
vc <- vcovHC(out4,type='HC1')
set.seed(123)
simbetas <- MASS::mvrnorm(1000,pe,vc)

# set counterfactuals
xhyp <- seq(mean(df$log_prox,na.rm=T) - 2*sd(df$log_prox, na.rm=T),0,length.out = 100)
mm <- cfMake(log_form, df[df$pct_white_1960 >= 0.90,], nscen=100)
mm <- cfChange(mm, 'log_prox', x=xhyp, scen=1:100)
out <- linearsimev(mm, simbetas, 0.95)
dat <- data.frame(x=xhyp, y=out$pe,lower=out$lower, upper=out$upper)
g3 <- ggplot(dat, aes(x,y)) +
  geom_line() +
  geom_line(aes(x,upper),linetype=2) +
  geom_line(aes(x, lower), linetype=2) +
  scale_y_continuous(limits=c(50,80)) +
 theme_bw() +
  labs(
    y='',
    x='Prox Growing Black Cities\n(miles)'
  ) +
  geom_rug(data=df[df$pct_white_1960 >= 0.9,], 
           aes(pct_14_yes, x=log_prox),sides = 'b') +
scale_x_continuous(limits=c(xhyp[1],0),
                   breaks=c(log(c(2,1.5,1,.5,0)+1)*-1),
                   labels = c(-200, -150, -100, -50, 0))

# first diff
mm <- cfMake(log_form, df[df$pct_white_1960 >= 0.90,], nscen=1)
mm <- cfChange(mm, 'log_prox', xpre=xhyp[1],x=xhyp[100], scen=1)
linearsimfd(mm, simbetas, 0.95)

#combine
grob <- grid.arrange(g1, g2, g3,ncol=3)
ggsave(grob,width=7.5, height=2.5, dpi=1000, 
       file='figures/figure_2.eps')

# Change in White Pop -----------------------------------------------------

temp <- df
temp$dist <- NA
temp$dist[temp$dist_black_98_percentile > -0.05] <- 'less than 5'
temp$dist[temp$dist_black_98_percentile <= -0.05 & temp$dist_black_98_percentile > -0.10] <- '5 to 10'
temp$dist[temp$dist_black_98_percentile <= -0.10 & temp$dist_black_98_percentile > -0.20] <- '10 to 20'
temp$dist[temp$dist_black_98_percentile <= -0.20 & temp$dist_black_98_percentile > -0.30] <- '20 to 30'
temp$dist[temp$dist_black_98_percentile <= -0.30 & temp$dist_black_98_percentile > -0.50] <- '30 to 50'
temp$dist[temp$dist_black_98_percentile <= -0.50 & temp$dist_black_98_percentile > -1] <- '50 to 100'
temp$dist[temp$dist_black_98_percentile <= -1 & temp$dist_black_98_percentile > -2] <- '100 to 200'
temp$dist[temp$dist_black_98_percentile <= -2] <- '200 to 400'
temp$dist <- factor(temp$dist,levels=c('less than 5','5 to 10','10 to 20','20 to 30','30 to 50',
                          '50 to 100','100 to 200','200 to 400'))


#change white pop
g.out <- temp %>%
  group_by(dist) %>%
  mutate(white_grow = pct_white_1960 - pct_white_1940) %>%
  summarise(white_grow_mean = mean(white_grow,na.rm=T),
            lower=white_grow_mean-sqrt(var(white_grow,na.rm=T)),
            upper=white_grow_mean+sqrt(var(white_grow,na.rm=T))) %>%
  na.omit() %>%
  ggplot(aes(dist,white_grow_mean)) + geom_point() +
  geom_errorbar(aes(x=dist,ymin=lower,ymax=upper),width=.1) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x='Proximity to Nearest Black Growth City',
       y="Mean White Population Change\n(1940-1960)") +
  geom_hline(yintercept = 0,linetype=2,alpha=.5,color='red') + theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
g.out
ggsave(g.out,width=4,height=3,dpi=1000,file=
         'figures/figure_L1.eps')


# Robustness Distance Measures --------------------------------------------

summary(m1 <- lm(pct_14_yes ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_1960>=0.90,]))
summary(m2 <- lm(pct_14_yes ~ dist_black_95_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_1960>=0.90,]))
summary(m3 <- lm(pct_14_yes ~ dist_black_90_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_1960>=0.90,]))
summary(m4 <- lm(pct_14_yes ~ dist_black_85_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden, data=df[df$pct_white_1960>=0.90,]))
bpm(list(m1,m2,m3,m4))
stargazer(m1,m2,m3,m4,
          se=list(rse(m1),rse(m2),rse(m3),rse(m4)),
          type='html',style = 'APSR',
          covariate.labels = c('Proximity 98','Proximity 95','Proximity 90','Proximity 85',
                               'Median Income','Unemployment','Homeownership',
                               'Partisan Composition', 'Population Density'),
          dep.var.labels = 'Prop 14, 1964', 
          out='tables/table_G1.html')

# Housing Tenure/Other Split Samples --------------------------------------

# home ownership and white growth
median_home_owner_pct <- median(df$pct_home_before_40,na.rm=T)
median_white_growth <- median(df$pct_white_growth,na.rm=T)

# 1940 covariates
# rescaled to be consistent with 1960 data
df$pct_owner_occupied_40 <- with(df,ownocc40trc/df$hunits40trc.x*100)
df$pct_unemployed_1940 <- (df$unempers40trc/df$totpop40trc)

m1 <- lm(pct_14_yes ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_1960 >= 0.9 & df$pct_home_before_40_oc < median_home_owner_pct ,])
m2 <- lm(pct_14_yes ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_1960 >= 0.9 & df$pct_home_before_40_oc > median_home_owner_pct,])
m3 <- lm(pct_14_yes ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_growth < median_white_growth & df$pct_white_1960 >= 0.9,])
m4 <- lm(pct_14_yes ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_growth > median_white_growth & df$pct_white_1960 >= 0.9,])
m5 <- lm(pct_14_yes ~ dist_black_98_percentile + dist_housing_vacancy + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_1960 >= 0.90,])
m6 <- lm(pct_14_yes ~ dist_black_98_percentile + change_housing_values + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_1960 >= 0.90,])

#testing for heteroskedasticity
bpm(list(m1,m2,m3,m4,m5,m6)) #note model 1

out <- data.frame(labels=c('Acquired Before 1940 - Below Median','Acquired Before 1940 - Above Median',
                      'White Growth - Below Median','White Growth - Above Median',
                      'Control - Contracting Housing','Control - Inflating Home Values'),
coef=as.numeric(c(coef(m1)[2],coef(m2)[2],
                  coef(m3)[2],coef(m4)[2],
                  coef(m5)[2],coef(m6)[2])),
se=c(sqrt(vcov(m1)[2,2]),rse(m2)[2],
     rse(m3)[2],rse(m4)[2],
     rse(m5)[2],rse(m6)[2]),
facet=c('Acquired Before 1940','Acquired Before 1940',
        'White Growth','White Growth',
        'Control Housing','Control Housing'))
out$labels <- factor(out$labels,levels=rev(c('Acquired Before 1940 - Below Median','Acquired Before 1940 - Above Median',
                      'White Growth - Below Median','White Growth - Above Median',
                      'Control - Contracting Housing','Control - Inflating Home Values')))
out$facet <- factor(out$facet, levels=c('Acquired Before 1940','White Growth','Control Housing'))

g.out <- out %>%
  ggplot(aes(labels,coef)) + geom_point() +
  geom_errorbar(aes(x=labels,ymin=coef-se,ymax=coef+se),width=.1) +
  geom_hline(yintercept=0,linetype=2,alpha=.5,color='red') +
  theme_bw() +
  coord_flip() +
  labs(x='',y='Proximity Coefficient') + 
  facet_grid(facet~.,scales='free_y',
             space='free_y') +
  theme(strip.background = element_blank(),
   strip.text.y = element_blank()) +
  scale_y_continuous(limits=c(0,10.5))
g.out
ggsave(g.out,width=6,height=2,dpi=1000,file=
         'figures/figure_3.eps')

stargazer(m1,m2,m3,m4,m5,m6,
          se=list(sqrt(diag(vcov(m1))),rse(m2),rse(m3),rse(m4),rse(m5),rse(m6)),
          type='html',style = 'APSR',
          covariate.labels = c('Proximity','Contracting Housing','Home Values',
                               'Median Income','Unemployment','Homeownership',
                               'Median Home Values','Unemployment','Homeownership',
                               'Partisan Composition','Population Density',
                               'Population Density'),
          digits = 2,digits.extra = 1,
          column.labels = c('< Median Res Tenure','> Median Res Tenure',
                            '< Median White Grow','> Median White Grow',
                            'Prox Contracting Housing','Prox Rising Home Values'),
          dep.var.labels = 'Prop 14, 1964', 
          out='tables/table_K1.html')


# Descriptives ------------------------------------------------------------

# Descriptives main IV
g.out <- ggplot(df[df$pct_white_1960 >= 0.9,],aes(dist_black_98_percentile)) + 
  geom_histogram(fill='white',color='black') + theme_bw() +
  labs(x='Proximity to Black Growth City (100 Miles)\n98th Percentile',
       y='')
g.out
ggsave(g.out,width=4,height=3,dpi=1000,file='figures/figure_C2.eps')

# Descriptives main DV
g.out <- ggplot(df[df$pct_white_1960 >= 0.9,],aes(pct_14_yes)) + 
  geom_histogram(fill='white',color='black') + theme_bw() +
  labs(x='Percent Yes Proposition 14',
       y='')
g.out
ggsave(g.out,width=4,height=3,dpi=1000,file='figures/figure_C1.eps')

df %>%
  filter(pct_white_1960 >= 0.9) %>%
  summarise(Income=mean(median_income,na.rm=T),
            Unemploy=mean(pct_unemployed_1960,na.rm=T),
            OwnerOcc=mean(pct_owner_occupied,na.rm=T),
            Dem=mean(pct_dem_1964,na.rm=T),
            PopDen=mean(popden,na.rm=T),
            )
df %>%
  filter(pct_white_1960 >= 0.9) %>%
  summarise(Income=sd(median_income,na.rm=T),
            Unemploy=sd(pct_unemployed_1960,na.rm=T),
            OwnerOcc=sd(pct_owner_occupied,na.rm=T),
            Dem=sd(pct_dem_1964,na.rm=T),
            PopDen=sd(popden,na.rm=T),
            )


# 1940 Covariates ---------------------------------------------------------


m1 <- lm(pct_14_yes ~ dist_black_98_percentile + pct_dem_1964 + I(popden40trc/1000) + pct_owner_occupied_40 + I(medvalue40trc/1000) + pct_unemployed_1940,data=df[df$pct_white_1960 >= 0.9,])
bptest(m1)
stargazer(m1,type='html',style = 'APSR',
          covariate.labels = c('Proximity','Partisan Composition',
                               'Population Density','Homeownership',
                               'Median Home Values','Unemployment'),
          column.labels = c('Prop 14, 1964'),
          dep.var.labels.include = F, out='tables/table_H1.html')


# Placebos ----------------------------------------------------------------

out1 <- lm(pct_14_yes ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_1960 >= 0.90,])
out2 <- lm(pct_prop_1_66_yes*100 ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_1960 >= 0.90,])
out3 <- lm(pct_prop_16_66_yes*100 ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df[df$pct_white_1960 >= 0.90,])

bpm(list(out1,out2,out3))

stargazer(out1,out2,out3,
          se=list(rse(out1),rse(out2),rse(out3)),
          style = 'APSR',
          covariate.labels = c('Proximity','Median Income','Unemployment','Homeownership',
                               'Partisan Composition (%D)','Population Density'),
          column.labels = c('Prop 14, 1964',
                            'Prop 1, 1966', 'Prop 16, 1966'),
          digits = 2,digits.extra = 1,
          dep.var.labels.include = F,
          type = 'html',out='tables/table_I1.html')

# EI Analyses -------------------------------------------------------------

#create non-white binary
df$pct_nonwhite_1960 <- 1 - df$pct_white_1960

#clean listwise deleted dataset
dat_ei <- simcf::extractdata(pct_14_yes ~ pct_white_1960 + total_votes + order + pct_nonwhite_1960,df,na.rm = T)

# dv needs to range between 0 and 1
dat_ei$pct_14_yes <- dat_ei$pct_14_yes/100 

# run ei
table_names <- c("EI: Pct White","EI: Pct Nonwhite")
ei.out <- ei_est_gen(cand_vector = 'pct_14_yes',
                     race_group = c('~pct_white_1960','~pct_nonwhite_1960'),
                     total="total_votes", data=dat_ei,
                     table_names = table_names, beta_yes=T)

dat_ei$est_white_vote <- ei.out[[2]][,1]
df <- left_join(df, dat_ei[,c('order','est_white_vote')], by='order')

# run model now without sample restrictions
out1 <- lm(est_white_vote*100 ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden,data=df)
bptest(out1)
stargazer(out1, style = 'APSR',
          se=list(rse(out1)),
          covariate.labels = c('Proximity','Median Income','Unemployment','Homeownership',
                               'Partisan Composition (%D)','Population Density'),
          column.labels = c('Prop 14, 1964'),
          digits = 2,digits.extra = 1,
          dep.var.labels.include = F,
          type = 'html',out='tables/table_F1.html')


# Sample Restrictions -----------------------------------------------------

include <- extractdata(pct_14_yes ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden, df,na.rm=T)
include <- as.numeric(rownames(include))

out1 <- lm(bivariate_form, data=df)
out2 <- lm(bivariate_form, data=df[include,])
out3 <- lm(bivariate_form, data=df[-include,])
out4 <- lm(full_model_form, data=df)
out5 <- lm(full_model_form, data=df[df$pct_white_1960 >= 0.90,])
out6 <- lm(full_model_form, data=df[df$pct_white_1960 >= 0.95,])

bpm(list(out1,out2,out3,out4,out5,out6))
stargazer(out1,out2,out3,out4,out5,out6,
          se=list(rse(out1),rse(out2),rse(out3),rse(out4),rse(out5),rse(out6)),
          style = 'APSR',
          covariate.labels = c(
            'Proximity','Median Income','Unemployment','Homeownership',
            'Partisan Composition (%D)','Population Density'
            ),
          dep.var.labels = 'Prop 14, 1964',
          column.labels = c('Full','Full Control Cities','Full Excluded Cities','Full Controls','90% Controls','95% Controls'),
          type = 'html',out='tables/table_E1.html')


# Driving Distance and Times ----------------------------------------------

#models
out1 <- lm(pct_14_yes ~ dist_black_98_percentile + pct_dem_1964 + popden + pct_owner_occupied + median_income + pct_unemployed_1960,data=df[df$pct_white_1960 >= 0.90,])
out2 <- lm(pct_14_yes ~ drive_distance + pct_dem_1964 + popden + pct_owner_occupied + median_income + pct_unemployed_1960,data=df[df$pct_white_1960 >= 0.90,])
out3 <- lm(pct_14_yes ~ travel_time + pct_dem_1964 + popden + pct_owner_occupied + median_income + pct_unemployed_1960,data=df[df$pct_white_1960 >= 0.90,])

bpm(list(out1,out2,out3))

stargazer(out1,out2,out3, 
          se=list(rse(out1),rse(out2),rse(out3)),
          style = 'APSR',
          covariate.labels = c('Proximity','Driving Distance','Travel Time','Median Income','Unemployment','Homeownership',
                               'Partisan Composition (%D)','Population Density'),
          dep.var.labels = c('Vote to Prop 14'),
          type = 'html',out='tables/table_G2.html')

cor(cbind(df$drive_distance, df$travel_time, df$dist_black_98_percentile), use='complete.obs')

# Fixed Effects -----------------------------------------------------------

out1 <- lm(pct_14_yes ~ dist_black_98_percentile + median_income + pct_unemployed_1960 + pct_owner_occupied + pct_dem_1964 + popden + as.factor(county),data=df[df$pct_white_1960 > 0.90,])
stargazer(out1,
          se=list(rse(out1)),
          style = 'APSR',
          covariate.labels = c('Proximity','Median Income','Unemployment','Homeownership',
                               'Partisan Composition (%D)','Population Density'),
          column.labels = c('Prop 14, 1964'),
          digits = 2,digits.extra = 1,
          dep.var.labels.include = F,
          add.lines = list(c("Fixed effects?", "Yes")),
          omit = "as.factor",
          type = 'html',
          out='tables/table_D2.html')

# Include/Exclude ---------------------------------------------------------

include90 <- which(df$pct_white_1960 >= 0.9)
exclude90 <- (1:nrow(df))[!1:nrow(df) %in% include90]
include95 <- which(df$pct_white_1960 >= 0.95)
exclude95 <- (1:nrow(df))[!1:nrow(df) %in% include95]

length(include90)
length(exclude90)
length(include95)
length(exclude95)

mean(df$dist_black_98_percentile,na.rm=T)
mean(df$dist_black_98_percentile[include90],na.rm=T)
mean(df$dist_black_98_percentile[include95],na.rm=T)
mean(df$dist_black_98_percentile[exclude90],na.rm=T)
mean(df$dist_black_98_percentile[exclude95],na.rm=T)

dat1 <- df[exclude90,c('dist_black_98_percentile', 'pct_14_yes','pct_white_1960','city')]
dat2 <- df[exclude95,c('dist_black_98_percentile', 'pct_14_yes','pct_white_1960','city')]
dat1$restriction <- "Excluded 90% Cutoff"
dat2$restriction <- "Excluded 95% Cutoff"
dat <- bind_rows(dat1, dat2)

g.out <- ggplot(dat, aes(dist_black_98_percentile, pct_14_yes)) +
  geom_point() +
  theme_bw() +
  facet_wrap(~restriction) +
    labs(
    y='Predicted Vote Prop 14 (%)',
    x='Prox Growing Black Cities\n(100 miles)'
  )
g.out
ggsave(g.out,width=5,height=2.5,dpi=1000,
       file='figures/figure_E1.eps')

# Different Distance Restrictions -----------------------------------------

summary(out1 <- lm(full_model_form,data=df[df$pct_white_1960 >= 0.90 & df$dist_black_98_percentile > -2,]))
summary(out2 <- lm(full_model_form,data=df[df$pct_white_1960 >= 0.90 & df$dist_black_98_percentile > -1,]))
summary(out3 <- lm(full_model_form,data=df[df$pct_white_1960 >= 0.90 & df$dist_black_98_percentile > -.75,]))
summary(out4 <- lm(full_model_form,data=df[df$pct_white_1960 >= 0.90 & df$dist_black_98_percentile > -.50,]))
bpm(list(out1,out2,out3,out4))

stargazer(out1,out2,out3,out4,
          se=list(rse(out1),rse(out2),rse(out3),rse(out4)),
          style = 'APSR',
          covariate.labels = c('Proximity','Median Income','Unemployment','Homeownership',
                               'Partisan Composition (%D)','Population Density'),
          digits = 2,digits.extra = 1,
          dep.var.labels = 'Prop 14, 1964',
          type = 'html',out='tables/table_N1.html')

# Land Area ---------------------------------------------------------------

lmass <- read_csv('land_mass_1960.csv')
df <- left_join(df, lmass, by=c('county','city'))
medmile <- median(df$miles,na.rm=T)
summary(out1 <- lm(pct_14_yes ~ dist_black_98_percentile, data=df[df$pct_white_1960 >= 0.90 & df$miles < medmile,]))
out2 <- lm(pct_14_yes ~ dist_black_98_percentile, data=df[df$pct_white_1960 >= 0.90 & df$miles > medmile,])
bpm(list(out1,out2))
stargazer(out1,out2,
          se=list(rse(out1),rse(out2)),
          style = 'APSR',
          covariate.labels = c('Proximity'),
          digits = 2,digits.extra = 1,
          dep.var.labels = 'Prop 14, 1964',
          type = 'html',out='tables/table_B1.html')

# Field Polls -------------------------------------------------------------

library(haven)
library(rgeos)
library(rgdal)
library(mgcv)
library(tidyverse)
library(survey)
library(lme4)
library(MASS)
library(stringr)
library(survey)
library(jtools)
library(stargazer)
library(rms)

####
# codebooks http://sda.berkeley.edu/sdadata/calpolls/cal6402/Doc/calp.htm
####

#read in files
df6405 <- read_dta('ca_field_6405.dta')
df6406 <- read_dta('ca_field_6406.dta')
df6407 <- read_dta('ca_field_6407.dta')

# recode education
df6405$educ <- df6405$p2
df6405$college <- ifelse(df6405$educ > 4,1,0)
df6406$educ <- df6406$case2
df6406$college <- ifelse(df6406$educ > 4,1,0)
df6407$educ <- df6407$p2
df6407$college <- ifelse(df6407$educ > 4,1,0)

# recode age
df6405$age <- df6405$p3
df6406$age <- df6406$region2
df6407$age <- df6407$p3

# recode gender
df6405$female <- df6405$p10
df6406$female <- df6406$p6
df6407$female <- df6407$p8 

# recode race
df6405$white <- df6405$p11
df6406$white <- df6406$p7
df6407$white <- df6407$p9

# recode income
df6405$income <- df6405$p12
df6406$income <- df6406$p8
df6407$income <- df6407$p10

# recode county
df6405$county <- df6405$p14
df6405$county <- as.character(as_factor(df6405$county))
attr(df6406$p11, which = 'labels') <- attr(df6406$p15, which = 'labels')
df6406$county <- as.character(as_factor(df6406$p11))
df6407$county <- as.character(as_factor(df6407$p12))

# recode homeownership 
df6405$p1[df6405$p1=='NaN'] <- NA
df6405$homeowner <- as.factor(df6405$p1)
df6406$poll2[df6406$poll2=='NaN'] <- NA
df6406$homeowner <- as.factor(df6406$poll2)
df6407$p1[df6407$p1=='NaN'] <- NA
df6407$homeowner <- as.factor(df6407$p1)

####
# Merge datasets and recode DV
####

df6405 <- df6405 %>% 
  mutate(q16 = as.numeric(q16),
         yes_prop14 = case_when(
           q16 == 1 ~ 1,
           q16 == 2 ~ -1,
           q16 == 3 ~ 0,
           q16 == 4 ~ 0,
           T ~ q16))
table(df6405$yes_prop14)
df6406 <- df6406 %>% 
  mutate(cq5 = as.numeric(cq5),
         yes_prop14 = case_when(
           cq5 == 1 ~ 1,
           cq5 == 2 ~ -1,
           cq5 == 3 ~ 0,
           T ~ cq5))
table(df6406$yes_prop14)
df6407 <- df6407 %>% 
  mutate(q6 = as.numeric(q6),
         yes_prop14 = case_when(
           q6 == 1 ~ 1,
           q6 == 2 ~ -1,
           q6 == 3 ~ 0,
           q6 == 4 ~ 0,
           T ~ q6))

# select just key sample vars

out1 <- df6405 %>% 
  filter(white==1) %>%
  dplyr::select(age, female, yes_prop14,county,homeowner,college,income)%>%
  mutate(survey='6405')
out2 <- df6406 %>% 
  filter(white==1) %>%
  dplyr::select(age, female, yes_prop14,county,homeowner,college,income)%>%
  mutate(survey='6406')
out3 <- df6407 %>% 
  filter(white==1) %>%
  dplyr::select(age, female, yes_prop14,county,homeowner,college,income)%>%
  mutate(survey='6407')

# bind rows
survey <- bind_rows(out1,out2,out3)

# DVs
survey$yes_prop14_ord <- survey$yes_prop14
survey$yes_prop14_di <- survey$yes_prop14
survey$yes_prop14_di[survey$yes_prop14 == -1] <- 0

# clean additional survey variables
survey$female[survey$female=='NaN'] <- NA
survey$female <- as.factor(survey$female)
table(survey$female)

survey$age[survey$age=='NaN'] <- NA
survey$age <- as.factor(survey$age)
table(survey$age)

survey$county[survey$county =='alameda county'] <- 'alameda'
survey$county[survey$county =='monterey county'] <- 'monterey'
survey$county[survey$county =='NaN'] <- NA
survey$county[survey$county =='san mated'] <- 'san mateo'
survey$county[survey$county =='soland'] <- 'solano'

#survey college
survey$college <- as.factor(survey$college)

####
# treated counties based on previous analysis
####

treated_counties <- c('alameda', 'los angeles',
                      'contra costa','solano')

####
# Read in shape file
####

#read in cities shape file
shapefile <- readOGR("ca_county_shp", layer = "CA_counties") # Read in world shapefile

#calculate centroids
centroids <- gCentroid(shapefile, byid = TRUE)
dist.matrix <- spDists(centroids, longlat=TRUE)  #distance in km

countynames <- tolower(gsub(' city', '',shapefile@data$NAME))

#names for matrix
rownames(dist.matrix) <- colnames(dist.matrix) <- countynames

#construct distance matrix
temp <- dist.matrix[,treated_counties]
min_dist <- apply(temp,1,function(x) min(x,na.rm=T)) #take min
dist.mat <- data.frame(county=rownames(dist.matrix),min_dist) 
dist.mat <- na.omit(dist.mat)

#kms to miles
kms_to_miles <- 0.621371
dist.mat$min_dist <- dist.mat$min_dist * -1 * kms_to_miles

# merge back in
survey <- survey %>%
  left_join(dist.mat,by='county')

# calculate sample size for each county
n_county <- survey %>%
  group_by(county) %>%
  summarise(n()) %>%
  as.data.frame()

#append county vars 
bp40 <- read_csv('black_pop_county_ca_1940.csv')
bp60 <- read_csv('black_pop_county_ca_1960.csv')
bp40$county <- tolower(bp40$county)
bp60$county <- tolower(gsub(' County','',bp60$county))
bp <- left_join(bp40,bp60,by='county')
bp$grow_black_pop <- (bp$black/bp$total_pop) - (bp$black_1940/bp$total_pop_1940)

cd40 <- read_csv('county_demos_1940.csv')
cd40$county <- tolower(cd40$county)
names(cd40) <- c('county','popden_county','unemployed_county','college_county','homeowner_county')
cd40$popden_county <- cd40$popden_county/1000

survey <- left_join(survey, bp[,c('county','grow_black_pop')], by='county') %>%
  left_join(cd40, by='county')

###
# Population Data
###

census <- read_csv('census_age_gender.csv')
educ <- read_csv('ca_census_tract_education.csv')

#fix county labels
educ$county <- sapply(str_split(educ$county,', '), 
                      function(x) x[2])

# aggregate up education
cty <- educ %>%
  group_by(county) %>%
  summarise(total_pop = sum(total_white),
            no_school = sum(no_school),
            elementary = sum(elementary),
            high_school = sum(high_school),
            college = sum(college)) %>%
  right_join(census, by='county')

#calculate percentages
cty$pct_male <- cty$male/(cty$male + cty$female)
cty$pct_female <- 1 - cty$pct_male
cty$pct_college <- cty$college / cty$total_pop
cty$total_pop_over21 <- (cty$age_21_29 + cty$age_30_39 + cty$age_40_49 + cty$age_50_59 + cty$age_60_69 + cty$age_over_70)
cty$pct_age_21_29 <- cty$age_21_29/cty$total_pop_over21
cty$pct_age_30_39 <- cty$age_30_39/cty$total_pop_over21
cty$pct_age_40_49 <- cty$age_40_49/cty$total_pop_over21
cty$pct_age_50_59 <- cty$age_50_59/cty$total_pop_over21
cty$pct_age_60_69 <- cty$age_60_69/cty$total_pop_over21

#percents
sum(cty$college,na.rm=T)/sum(cty$college ,cty$high_school,cty$elementary ,cty$no_school, na.rm=T)

#for total pop
pop <- census %>%
  summarise(male=sum(male),female=sum(female),total_pop=male+female,
            total_pop_over21=sum(age_21_29 + age_30_39 + age_40_49 + age_50_59 + age_60_69 + age_over_70),
            age_21_29=sum(age_21_29),
            age_30_39=sum(age_30_39),
            age_40_49=sum(age_40_49),
            age_50_59=sum(age_50_59),
            age_60_69=sum(age_60_69),
            age_over_70=sum(age_over_70))
pop$pct_male <- pop$male/pop$total_pop
pop$pct_female <- pop$female/pop$total_pop
pop$pct_age_21_29 <- pop$age_21_29/pop$total_pop_over21
pop$pct_age_30_39 <- pop$age_30_39/pop$total_pop_over21
pop$pct_age_40_49 <- pop$age_40_49/pop$total_pop_over21
pop$pct_age_50_59 <- pop$age_50_59/pop$total_pop_over21
pop$pct_age_60_69 <- pop$age_60_69/pop$total_pop_over21
pop$pct_age_over_70 <- pop$age_over_70/pop$total_pop_over21

#pop density
pd <- read_csv('pop_density.csv')
census <- census %>% left_join(pd,by='county')

####
# Raking weights
####

samp_design_srs <- survey %>%
  dplyr::select(female,age,county,yes_prop14,yes_prop14_di,yes_prop14_ord,homeowner,college,survey,income,min_dist,
                popden_county,unemployed_county,college_county,homeowner_county) %>%
  na.omit() %>%
  svydesign(~1, data=.)

weight_vars <- c('female','age','homeowner', 'college')

#specify formula for auxiliary vector
form <- as.formula(paste0("~", paste(weight_vars, collapse = " + "),"-1"))

pop_table <- as.matrix(rbind(pop$pct_male,pop$pct_female,
                             pop$pct_age_30_39,pop$pct_age_40_49,
                             pop$pct_age_50_59,pop$pct_age_60_69,
                             pop$pct_age_over_70,
                             0.416, 0.1245)) #homeownership
rownames(pop_table) <- c('female1',
                         'female2',
                         'age2','age3',
                         'age4','age5',
                         'age6',
                         'homeowner2',
                         'college2')

# calibrate rake weights
samp_rake_srs <-  survey::calibrate(samp_design_srs,form, pop_table,
                                    calfun = "raking", maxit = 1000,
                                    epsilon = 0.0001)

### Append census data
for_regression <- samp_rake_srs$variables %>%
  left_join(bp[,c('county','grow_black_pop')], by='county')

for_regression$wt <- weights(samp_rake_srs)

#specify formula
form <- yes_prop14_di ~ grow_black_pop + popden_county + unemployed_county +
  female + age + homeowner + college + income + as.factor(survey)
out <- glm(form,for_regression,family=binomial(link='logit'),weights = wt)
out2 <- lm(form,for_regression,weights = wt)

outAdj <- clusterRobustSE(out, for_regression$county)
outAdj2 <- clusterRobustSE(out2, for_regression$county)

stargazer(out,out2, style = 'APSR',
          se = list(sqrt(diag(outAdj$vcov)),sqrt(diag(outAdj2$vcov))),
          covariate.labels = c('Growth in Black Pop 1940-1960',
                               'Pop Density County 1940',
                               'Unemployed County 1940',
                               'Female','Age 30-39',
                               'Age 40-49','Age 50-59',
                               'Age 60-69','Age Over 70',
                               'Homeowner','College',
                               'Income','Survey 6406', 'Survey 6407'),
          digits = 2,digits.extra = 1,
          dep.var.labels = c('Prop 14, 1964'),
          type = 'html',
          out='tables/table_J1.html')


# Maps Figure 1 --------------------------------------------------------------------

#load packages
library(readxl)
library(tidyverse)
library(rgeos)
library(rgdal)
library(mgcv)
library(ggthemes)
library(RColorBrewer)

#read in our data
df <- read_csv('clean_data_final_reny_newman.csv')

#####
# shape files
#####

#read in cities shape file
shapefile <- readOGR("pl06_d90_shp-1", layer = "pl06_d90") # Read in shapefile

#fix names
shapefile@data$NAME <- gsub(' city', '',shapefile@data$NAME)
shapefile@data$NAME <- gsub(' CDP', '',shapefile@data$NAME)
shapefile@data$NAME <- gsub(' town', '',shapefile@data$NAME)

#fix those cities that dont match
shapefile@data$NAME[shapefile@data$NAME=='Alturas'] <- 'Altura'
shapefile@data$NAME[shapefile@data$NAME=='Amador City'] <- 'Amador'
shapefile@data$NAME[shapefile@data$NAME=='Atherton town'] <- 'Atherton'
shapefile@data$NAME[shapefile@data$NAME=='Carmel-by-the-Sea'] <- 'Carmel'
shapefile@data$NAME[shapefile@data$NAME=='Lake Elsinore'] <- 'Elsinore'
shapefile@data$NAME[shapefile@data$NAME=='Palos Verdes Estates'] <- 'Palos Verdes Estate'
shapefile@data$NAME[shapefile@data$NAME=='Amador'] <- 'Amador City'
shapefile@data$NAME[shapefile@data$NAME=='Rolling Hills Estates'] <- 'Rolling Hills Estate'
shapefile@data$NAME[shapefile@data$NAME=='St. Helena'] <- 'Saint Helena'
shapefile@data$NAME[shapefile@data$NAME=='San Bernardino'] <- 'San Bernadino'
shapefile@data$NAME[shapefile@data$NAME=='San Buenaventura (Ventura)'] <- 'Ventura'
shapefile@data$NAME[shapefile@data$NAME=='Willows'] <- 'Willow'
shapefile@data$NAME[shapefile@data$NAME=='Yuba City'] <- 'Yuba'
shapefile@data$NAME[shapefile@data$NAME=="El Paso de Robles (Paso Robles)"] <- 'Paso Robles'

# fortify and merge: muni.df is used in ggplot
shapefile@data$id <- rownames(shapefile@data)
region.df <- fortify(shapefile)
region.df <- plyr::join(region.df,shapefile@data, by='id')
region.df <- left_join(region.df, df, by=c('NAME'='city'))

tocut <- unique(region.df$NAME[!region.df$NAME %in% df$city])
region.df <- region.df[-which(region.df$NAME %in% tocut),]

# fill cuts
region.df$black_grow_cut <- NA
region.df$black_grow_cut[region.df$pct_black_grow_1940_1960 >= 0 & region.df$pct_black_grow_1940_1960 <= 0.05] <- '0%-5%'
region.df$black_grow_cut[region.df$pct_black_grow_1940_1960 > 0.05 & region.df$pct_black_grow_1940_1960 <= 0.10] <- '6%-10%'
region.df$black_grow_cut[region.df$pct_black_grow_1940_1960 > 0.10 & region.df$pct_black_grow_1940_1960 <= 0.20] <- '11%-20%'
region.df$black_grow_cut[region.df$pct_black_grow_1940_1960 > 0.20] <- '>20%'
table(region.df$black_grow_cut)

# as factor
region.df$black_grow_cut <- factor(region.df$black_grow_cut,
                                   levels=c(
                                     "0%-5%","6%-10%","11%-20%",">20%"
                                   ))

#add state outline
shp <- readOGR("cb_2016_us_state_500k", layer = "cb_2016_us_state_500k") # Read in world shapefile
shp <- subset(shp,STATEFP == '06')      

# create the map layers
colors <- c('#fee5d9','#fcae91','#fb6a4a','#cb181d','white')

mapla <- ggplot(data=region.df, aes(x=long, y=lat, group=group)) + 
  geom_polygon(aes(fill=black_grow_cut),color='black',size=.025) + 
  theme_map() +
  geom_path(data=shp, aes(x=long, y=lat, group=group),
               fill='white', color='black',alpha=1,size=0.25) +
  scale_fill_manual(values=colors,name='Black Pop Growth\n1940-1960',
                    breaks=c("0%-5%","6%-10%","11%-20%",">20%")) +
  scale_y_continuous(limits=c(33,34.5 )) +
  scale_x_continuous(limits=c(-119,-117)) +
  annotate('text',x=-118.2,y=33.85,label='Compton',size=2.5) +
  annotate('text',x=-117.2,y=34.25,label='Pasadena',size=2.5) +
  annotate('text',x=-117.2,y=33.6,label='Elsinore',size=2.5) 

mapbay <- ggplot(data=region.df, aes(x=long, y=lat, group=group)) + 
  geom_polygon(aes(fill=black_grow_cut),color='black',size=.025) + 
  theme_map() +
  geom_path(data=shp, aes(x=long, y=lat, group=group),
               fill='white', color='black',alpha=1,size=0.25) +
  scale_fill_manual(values=colors,name='Black Growth, 1940-1960') +
  scale_y_continuous(limits=c(37,38.5 )) +
  scale_x_continuous(limits=c(-123, -121.5)) +
  theme(legend.position = 'none') +
  annotate('text',x=-122.3,y=38.05,label='Richmond',size=2.5)+
  annotate('text',x=-122.075,y=38.15,label='Vallejo',size=2.5)+
  annotate('text',x=-121.85,y=38.1,label='Pittsburg',size=2.5)+
  annotate('text',x=-122.2,y=37.8,label='Emeryville',size=2.5)+
  annotate('text',x=-122.4,y=37.85,label='Berkeley',size=2.5) +
  annotate('text',x=-122,y=37.5,label='Menlo Park',size=2.5) 

grob <- grid.arrange(mapla, mapbay,ncol=2)
ggsave(grob,width=8, height=4, dpi=1000, 
       file='figures/figure_1.eps')

mapvalley <- ggplot(data=region.df, aes(x=long, y=lat, group=group)) + 
  geom_polygon(aes(fill=black_grow_cut),color='black',size=.025) + 
  theme_map() +
  geom_polygon(data=shp, aes(x=long, y=lat, group=group),
               fill='white', color='black',alpha=.2,size=0.1) +
  scale_fill_manual(values=colors,name='Black Growth, 1940-1960',
                    breaks=c("0%-5%","6%-10%","11%-20%",">20%")) +
  scale_y_continuous(limits=c(35,37.5 )) +
  scale_x_continuous(limits=c(-121, -118))+
  annotate('text',x=-119.85,y=37,label='Madera',size=2.5)+
  annotate('text',x=-119.4,y=36.6,label='Fowler',size=2.5)+
  annotate('text',x=-119,y=35.5,label='Bakersfield',size=2.5)

ggsave(mapvalley,width=4,height=4,dpi=1000,file='figures/figure_M1.eps')


